林嶔 (Lin, Chin)
Lesson 8
– 請至這裡下載本週的範例資料
dat = read.csv("Example data.csv", header = TRUE, fileEncoding = 'CP950')
head(dat)
## eGFR Disease Survival.time Death Diabetes Cancer SBP DBP
## 1 34.65379 1 0.4771037 0 0 1 121.2353 121.3079
## 2 37.21183 1 3.0704424 0 1 1 122.2000 122.6283
## 3 32.60074 1 0.2607117 1 0 0 118.9136 121.7621
## 4 29.68481 1 NA NA 0 0 118.2212 112.7043
## 5 28.35726 0 0.1681673 1 0 0 116.7469 115.7705
## 6 33.95012 1 1.2238556 0 0 0 119.9936 116.3872
## Education Income
## 1 2 0
## 2 2 0
## 3 0 0
## 4 1 0
## 5 0 0
## 6 1 0
mean(dat$eGFR, na.rm = TRUE)
## [1] 34.14115
sd(dat$eGFR, na.rm = TRUE)
## [1] 7.062506
var(dat$eGFR, na.rm = TRUE)
## [1] 49.87899
median(dat$eGFR, na.rm = TRUE)
## [1] 34.33123
quantile(dat$eGFR, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 10.00000 29.58404 34.33123 38.44965 60.00000
quantile(dat$eGFR, 0.5, na.rm = TRUE)
## 50%
## 34.33123
quantile(dat$eGFR, 0.95, na.rm = TRUE)
## 95%
## 45.86468
min(dat$eGFR, na.rm = TRUE)
## [1] 10
max(dat$eGFR, na.rm = TRUE)
## [1] 60
table(dat$Cancer)
##
## 0 1
## 576 402
table(dat$Cancer, dat$Diabetes)
##
## 0 1
## 0 468 97
## 1 309 81
tab1 = table(dat$Cancer)
prop.table(tab1)
##
## 0 1
## 0.5889571 0.4110429
tab2 = table(dat$Cancer, dat$Diabetes)
prop.table(tab2)
##
## 0 1
## 0 0.49005236 0.10157068
## 1 0.32356021 0.08481675
prop.table(tab2, 1)
##
## 0 1
## 0 0.8283186 0.1716814
## 1 0.7923077 0.2076923
prop.table(tab2, 2)
##
## 0 1
## 0 0.6023166 0.5449438
## 1 0.3976834 0.4550562
mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
## [1] 34.41452
mean(dat[dat[,"Disease"] == 0,]$eGFR, na.rm = TRUE)
## [1] 33.19008
paste(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), sep = "")
## [1] "34.4145222034604±7.1582590897065"
paste0(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE))
## [1] "34.4145222034604±7.1582590897065"
m = mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
s = sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
formatC(m, digits = 3, format = "f")
## [1] "34.415"
formatC(s, digits = 3, format = "f")
## [1] "7.158"
paste0(formatC(m, digits = 3, format = "f"), "±", formatC(s, digits = 3, format = "f"))
## [1] "34.415±7.158"
函數之input必須為一類別變項及一連續變項
函數中必須設定一個變項,讓使用者能決定顯示小數點的位數
3-1. 第一個函數必須輸出在該類別變項為不同類別時,該連續變項之『平均數±標準差』
3-2. 第二個函數必須輸出在該類別變項為不同類別時,該連續變項之『中位數(25百分位-75百分位)』
– 注意:請考慮類別數不只是2的情形,以及類別項目並非為0、1、2、…的狀況
func.1 = function (x, y, dig = 3) {
lvl.x = levels(factor(x))
result = rep(NA, length(lvl.x))
for (i in 1:length(lvl.x)) {
m = mean(y[x==lvl.x[i]], na.rm = TRUE)
s = sd(y[x==lvl.x[i]], na.rm = TRUE)
result[i] = paste0(formatC(m, digits = dig, format = "f"), "±", formatC(s, digits = dig, format = "f"))
}
result
}
func.1(x = dat[,"Disease"], y = dat[,"eGFR"])
## [1] "33.190±6.404" "34.415±7.158"
func.1(x = dat[,"Education"], y = dat[,"SBP"], dig = 1)
## [1] "121.4±10.6" "121.4±10.9" "122.9±10.5"
func.2 = function (x, y, dig = 3) {
lvl.x = levels(factor(x))
result = rep(NA, length(lvl.x))
for (i in 1:length(lvl.x)) {
m = median(y[x==lvl.x[i]], na.rm = TRUE)
q.25 = quantile(y[x==lvl.x[i]], 0.25, na.rm = TRUE)
q.75 = quantile(y[x==lvl.x[i]], 0.75, na.rm = TRUE)
result[i] = paste0(formatC(m, digits = dig, format = "f"), "(", formatC(q.25, digits = dig, format = "f"), "-", formatC(q.75, digits = dig, format = "f"), ")")
}
result
}
func.2(x = dat[,"Disease"], y = dat[,"eGFR"])
## [1] "33.193(28.971-37.238)" "34.663(29.729-38.820)"
func.2(x = dat[,"Education"], y = dat[,"SBP"], dig = 1)
## [1] "121.4(114.8-127.8)" "121.8(114.5-128.9)" "122.8(115.8-129.2)"
\[\hat{y_i} = b_{0} + b_{1}x_i\\\]
\[y_i = b_{0} + b_{1}x_i + \epsilon\\\]
\[\epsilon = y_i - b_{0} - b_{1}x_i\\\]
– 接著再定義找出一組\(b_0\)與\(b_1\)使\(\sum{\epsilon}^2\)最小化:
\[\mbox{min}(\sum{\epsilon}^2) = \mbox{min}(\sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2)\]
\[ \begin{align} \frac{\partial}{\partial b_0} \sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2 & = 0 \\ \frac{\partial}{\partial b_1} \sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2 & = 0 \end{align} \]
– 對\(b_0\)偏微分的求解過程如下:
\[ \begin{align} \frac{\partial}{\partial b_0} \sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2 &= 2\sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i) \frac{\partial}{\partial b_0} (y_i - b_{0} - b_{1}x_i)\\ &= 2\sum\limits_{i=1}^{n}-(y_i - b_{0} - b_{1}x_i) \end{align} \]
– 對\(b_1\)偏微分的求解過程如下:
\[ \begin{align} \frac{\partial}{\partial b_1} \sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2 &= 2\sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i) \frac{\partial}{\partial b_1} (y_i - b_{0} - b_{1}x_i)\\ &= 2\sum\limits_{i=1}^{n}-x_i(y_i - b_{0} - b_{1}x_i) \end{align} \]
\[ \begin{align} \sum\limits_{i=1}^{n}( b_{0} + b_{1}x_i - y_i) &= 0\\ \sum\limits_{i=1}^{n}( b_{0}x_i + b_{1}x_i^2 - y_ix_i) &= 0 \end{align} \]
\[ \begin{align} b_{1} &= \frac{\sum\limits_{i=1}^{n}y_ix_i-\frac{\sum\limits_{i=1}^{n}y_i\sum\limits_{i=1}^{n}x_i}{n}}{\sum\limits_{i=1}^{n}x_ix_i-\frac{\sum\limits_{i=1}^{n}x_i\sum\limits_{i=1}^{n}x_i}{n}} \\ b_{0} &= \frac{\sum\limits_{i=1}^{n}y_i}{n} - b_{1} \frac{\sum\limits_{i=1}^{n}x_i}{n} \end{align} \]
\[ \begin{align} b_{1} &= \frac{cov(x,y)}{var(x)} \\ b_{0} &= mean(y) - b_{1} \cdot mean(x) \end{align} \]
x = c(1, 2, 3, 4, 5)
y = c(6, 7, 9, 8, 10)
b1 = cov(x, y)/var(x)
b1
## [1] 0.9
b0 = mean(y) - b1 * mean(x)
b0
## [1] 5.3
model = lm(y~x)
model
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 5.3 0.9
model$coefficients
## (Intercept) x
## 5.3 0.9
lm(SBP~DBP, data = dat)
##
## Call:
## lm(formula = SBP ~ DBP, data = dat)
##
## Coefficients:
## (Intercept) DBP
## 9.9012 0.9189
lm(dat[,"eGFR"]~dat[,"SBP"])
##
## Call:
## lm(formula = dat[, "eGFR"] ~ dat[, "SBP"])
##
## Coefficients:
## (Intercept) dat[, "SBP"]
## -44.2474 0.6443
lm(dat[,"eGFR"] ~ dat[,"SBP"] + dat[,"DBP"])
##
## Call:
## lm(formula = dat[, "eGFR"] ~ dat[, "SBP"] + dat[, "DBP"])
##
## Coefficients:
## (Intercept) dat[, "SBP"] dat[, "DBP"]
## -49.2204 0.4562 0.2293
lm(dat[,"eGFR"] ~ ., data = dat[,c("SBP", "DBP"),drop=FALSE])
##
## Call:
## lm(formula = dat[, "eGFR"] ~ ., data = dat[, c("SBP", "DBP"),
## drop = FALSE])
##
## Coefficients:
## (Intercept) SBP DBP
## -49.2204 0.4562 0.2293
現在我們想要使用SBP以及DBP來預測eGFR,所以希望你能寫出一個自訂函數,讓使用者輸入SBP與DBP的值,並為他計算出eGFR的值。
但是這時候我們遇到一個問題,那就是預測公式鐵定是以這樣的型式表現,因此假設有一個新的個案缺少SBP或DBP其中一項,那他將無法使用這個式子:
\[\hat{eGFR_i} = b_{0} + b_{1}SBP_i + b_{2}DBP_i\\\]
\[\hat{eGFR_i} = b_{0} + b_{1}DBP_i\\\]
– 這裡只展示第二種解法的函數:
pred_func = function (SBP = NA, DBP = NA) {
if (is.na(SBP) & is.na(DBP)) {stop('需要至少輸入一個數值')} else {
var_name = c('SBP', 'DBP')
var_vec = c(1, SBP, DBP)
var_name = var_name[!is.na(var_vec)[-1]]
var_vec = var_vec[!is.na(var_vec)]
model = lm(dat[,"eGFR"] ~ ., data = dat[,var_name,drop=FALSE])
result = model$coefficients %*% var_vec
result
}
}
pred_func(SBP = 100)
## [,1]
## [1,] 20.1858
pred_func(DBP = 100)
## [,1]
## [1,] 20.21421
pred_func(SBP = 100, DBP = 100)
## [,1]
## [1,] 19.32629
– 答案是可以的,對於已經編碼成0、1的變項我們可以直接把他放入線性迴歸,但是0、1、2的變項我們就不能直接放入線性迴歸之中,請想想為什麼?
x = c(0, 1, 2, 1, NA, 2)
coding_x = matrix(c(0, 1, 0, 1, NA, 0,
0, 0, 1, 0, NA, 1), nrow = 6, ncol = 2)
coding_x
## [,1] [,2]
## [1,] 0 0
## [2,] 1 0
## [3,] 0 1
## [4,] 1 0
## [5,] NA NA
## [6,] 0 1
x = c(0, 1, 2, 1, NA, 2)
lvl.cat = levels(factor(x))
n.cat = length(lvl.cat)
coding_x = matrix(0, nrow = length(x), ncol = n.cat - 1)
for (i in 1:(n.cat-1)) {
coding_x[x == lvl.cat[i+1],i] = 1
}
coding_x[is.na(x),] = NA
coding_x
## [,1] [,2]
## [1,] 0 0
## [2,] 1 0
## [3,] 0 1
## [4,] 1 0
## [5,] NA NA
## [6,] 0 1
x = c(0, 1, 2, 1, NA, 2)
coding_x = model.matrix(~as.factor(x))
coding_x
## (Intercept) as.factor(x)1 as.factor(x)2
## 1 1 0 0
## 2 1 1 0
## 3 1 0 1
## 4 1 1 0
## 6 1 0 1
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$`as.factor(x)`
## [1] "contr.treatment"
new_index = 1:length(x)
new_index[is.na(x)] = NA
new_index[!is.na(new_index)] = 1:sum(!is.na(x))
coding_x = coding_x[new_index,-1]
rownames(coding_x) = 1:nrow(coding_x)
one_hot_coding = function (x) {
coding_x = model.matrix(~as.factor(x))
new_index = 1:length(x)
new_index[is.na(x)] = NA
new_index[!is.na(new_index)] = 1:sum(!is.na(x))
coding_x = coding_x[new_index,-1]
rownames(coding_x) = 1:nrow(coding_x)
coding_x
}
x = c(0, 1, 2, 1, NA, 2)
one_hot_coding(x)
## as.factor(x)1 as.factor(x)2
## 1 0 0
## 2 1 0
## 3 0 1
## 4 1 0
## 5 NA NA
## 6 0 1
coding_Income = one_hot_coding(dat[,"Income"])
coding_Education = one_hot_coding(dat[,"Education"])
dat1 = cbind(dat, coding_Income, coding_Education)
lm(dat[,"eGFR"] ~ ., data = dat1[,c(7:8, 11:14)])
##
## Call:
## lm(formula = dat[, "eGFR"] ~ ., data = dat1[, c(7:8, 11:14)])
##
## Coefficients:
## (Intercept) SBP DBP
## -49.5690 0.4558 0.2305
## `as.factor(x)1` `as.factor(x)2` `as.factor(x)1.1`
## 0.3370 1.2791 0.1097
## `as.factor(x)2.1`
## 0.2518
lm(dat[,"eGFR"] ~ dat[,"SBP"] + dat[,"DBP"] + factor(dat[,"Income"]) + factor(dat[,"Education"]))
##
## Call:
## lm(formula = dat[, "eGFR"] ~ dat[, "SBP"] + dat[, "DBP"] + factor(dat[,
## "Income"]) + factor(dat[, "Education"]))
##
## Coefficients:
## (Intercept) dat[, "SBP"]
## -49.5690 0.4558
## dat[, "DBP"] factor(dat[, "Income"])1
## 0.2305 0.3370
## factor(dat[, "Income"])2 factor(dat[, "Education"])1
## 1.2791 0.1097
## factor(dat[, "Education"])2
## 0.2518
– 同樣的,假設使用者不願意填寫Income或是Education的情形,那就請你用剩餘的資訊進行預測。
– 這裡你就可以發現,假設我們有n個變項可供使用者填答,那他填與不填的所有組合就共有\(2^n-1\)種,因此剛剛的第二種寫法又更為重要了,試著完成他吧!
dat = read.csv("Example data.csv", header = TRUE, fileEncoding = 'CP950')
coding_Income = one_hot_coding(dat[,"Income"])
coding_Education = one_hot_coding(dat[,"Education"])
dat1 = cbind(dat, coding_Income, coding_Education)
colnames(dat1)[11:14] = c('Income1', 'Income2', 'Education1', 'Education2')
pred_func = function (SBP = NA, DBP = NA, Income = NA, Education = NA) {
if (is.na(SBP) & is.na(DBP) & is.na(Income) & is.na(Education)) {stop('需要至少輸入一個數值')} else {
if (is.na(Income)) {
coding_Income = c(NA, NA)
} else if (Income == 0) {
coding_Income = c(0, 0)
} else if (Income == 1) {
coding_Income = c(1, 0)
} else if (Income == 2) {
coding_Income = c(0, 1)
}
if (is.na(Education)) {
coding_Education = c(NA, NA)
} else if (Education == 0) {
coding_Education = c(0, 0)
} else if (Education == 1) {
coding_Education = c(1, 0)
} else if (Education == 2) {
coding_Education = c(0, 1)
}
var_name = c('SBP', 'DBP', 'Income1', 'Income2', 'Education1', 'Education2')
var_vec = c(1, SBP, DBP, coding_Income, coding_Education)
var_name = var_name[!is.na(var_vec)[-1]]
var_vec = var_vec[!is.na(var_vec)]
model = lm(dat[,"eGFR"] ~ ., data = dat1[,var_name,drop=FALSE])
result = model$coefficients %*% var_vec
result
}
}
pred_func(SBP = 100, DBP = 100, Income = 1)
## [,1]
## [1,] 19.44494
pred_func(SBP = 100, DBP = 100, Education = 2)
## [,1]
## [1,] 19.53829
pred_func(SBP = 100, DBP = 100, Income = 2, Education = 0)
## [,1]
## [1,] 20.33689
資料分析是R語言與其他程式語言相比之下強勢的地方,你會發現像是我們推導很久的線性迴歸就有內建函數可以幫助我們進行運算,除此之外幾乎所有常見的統計分析方法在R裡面都具有內建函數能夠使用。
由於課程設計的限制,我們並沒有完整的介紹R語言的統計分析功能,有興趣的同學可以參考碩士班的課程第5至第7課。
現在你應該已經有能力獨自從蒐集資料至資料分析的整個過程,程式語言將能協助你建立pipeline,讓你能隨時看到最新結果!